#load packages
library(ggplot2)
library(gridExtra)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following object is masked from 'package:gridExtra':
##
## combine
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(scales)
library(MASS)
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
library(reshape2)
# Read the data-breach incidents from CSV and preview the first six rows
df <- read.csv(file = 'data/data_breach.csv')
head(df, n = 6L)
## Date.Made.Public Company State Type.of.breach
## 1 2009-10-21 Bullitt County Public Schools Kentucky DISC
## 2 2009-10-21 Roane State Community College Tennessee PORT
## 3 2009-10-15 Halifax Health Florida PORT
## 4 2009-10-04 Suffolk Community College New York DISC
## 5 2009-09-28 Penrose Hospital Colorado PHYS
## 6 2009-09-23 Eastern Kentucky University Kentucky DISC
## Type.of.organization Total.Records
## 1 EDU 676
## 2 EDU 14783
## 3 MED 33000
## 4 EDU 300
## 5 MED 175
## 6 EDU 5045
## Description.of.incident
## 1 A Bullitt County Public Schools \n employee accidentally sent an e-mail message to about 1,800 school \n district workers that included the names and Social Security numbers \n of 676 district employees. The employees were identified as not having \n completed the district's 2010 open-enrollment process for insurance, \n and the e-mail was intended as a reminder to complete the process.
## 2 Roane State Community College \n has announced that the names and Social Security numbers of 9,747 \n current or former students were on a data storage device stolen from \n an employee's vehicle, along with 1,194 current/former employees' \n information. The Social Security numbers alone, with no names, were \n also stolen for 5,036 additional current or former students. The data \n was on a 4GB USB drive used for work-related purposes. An employee \n took it home to do work after hours, and left it in the car. The employee \n forgot to lock the car doors. The USB drive was stolen along with \n a personal hand-held device.
## 3 A laptop computer from a Halifax \n Health employee's vehicle in Orange County was stolen -- which might \n have contained password protected patient information.
## 4 Suffolk Community College has \n agreed to pay a company for the next year to monitor the credit of \n 300 students whose last names and Social Security numbers were mistakenly \n listed in an attachment to an e-mail sent to those students last month.
## 5 Officials at Penrose Hospital believe someone has stolen the personal information of 175 patients. The missing information consists of names, addresses, phone numbers, Social Security numbers and the reason for the patients' visits. The information was stored on a computer print-out and kept in a binder stored in a cabinet. The print out has gone missing.
## 6 The names and Social Security \n numbers of about 5,000 Eastern Kentucky University faculty, staff \n and student workers were posted inadvertently on the Internet last \n September, where they have been displayed for a year.
## Information.Source Source.URL Year.of.Breach Latitude Longitude
## 1 Dataloss DB 2009 37.98840 -85.71579
## 2 Dataloss DB 2009 35.93396 -84.55244
## 3 Dataloss DB 2009 29.21082 -81.02283
## 4 Dataloss DB 2009 40.86649 -73.03566
## 5 Dataloss DB 2009 38.83388 -104.82136
## 6 Dataloss DB 2009 37.74786 -84.29465
## breach_type_gA breach_type_gB breach_type_gC breach_type_gD region_A
## 1 Others Insider Others Network South
## 2 Others Mixed Offline Local South
## 3 Others Mixed Offline Local South
## 4 Others Insider Others Network Northeast
## 5 Others Mixed Offline Local West
## 6 Others Insider Others Network South
## region_B age_of_Law org_group records_group records_index
## 1 Southeast 0 F_regulated Others 0.0008129202
## 2 Southeast 14 F_regulated Others 0.0177772189
## 3 Southeast 14 F_regulated Others 0.0396839765
## 4 Northeast 14 F_regulated Others 0.0003607634
## 5 West 13 F_regulated Others 0.0002104453
## 6 Southeast 0 F_regulated Others 0.0060668382
## records_scale type_scale org_scale severity_index1 severity_index2
## 1 10 1 8 11 18
## 2 10 7 8 17 18
## 3 10 7 10 17 20
## 4 10 1 8 11 18
## 5 10 7 10 17 20
## 6 10 1 8 11 18
## severity_index3 severity_index4 severity_index5
## 1 19 6.333333 22.333333
## 2 25 8.333333 2.333333
## 3 27 9.000000 3.000000
## 4 19 6.333333 22.333333
## 5 27 9.000000 3.000000
## 6 19 6.333333 22.333333
# Keep only US incidents: drop every row whose region_A is 'Non_US'.
# which() also discards rows where the comparison is NA, matching subset().
df <- df[which(df$region_A != 'Non_US'), ]
head(df, n = 6L)
## Date.Made.Public Company State Type.of.breach
## 1 2009-10-21 Bullitt County Public Schools Kentucky DISC
## 2 2009-10-21 Roane State Community College Tennessee PORT
## 3 2009-10-15 Halifax Health Florida PORT
## 4 2009-10-04 Suffolk Community College New York DISC
## 5 2009-09-28 Penrose Hospital Colorado PHYS
## 6 2009-09-23 Eastern Kentucky University Kentucky DISC
## Type.of.organization Total.Records
## 1 EDU 676
## 2 EDU 14783
## 3 MED 33000
## 4 EDU 300
## 5 MED 175
## 6 EDU 5045
## Description.of.incident
## 1 A Bullitt County Public Schools \n employee accidentally sent an e-mail message to about 1,800 school \n district workers that included the names and Social Security numbers \n of 676 district employees. The employees were identified as not having \n completed the district's 2010 open-enrollment process for insurance, \n and the e-mail was intended as a reminder to complete the process.
## 2 Roane State Community College \n has announced that the names and Social Security numbers of 9,747 \n current or former students were on a data storage device stolen from \n an employee's vehicle, along with 1,194 current/former employees' \n information. The Social Security numbers alone, with no names, were \n also stolen for 5,036 additional current or former students. The data \n was on a 4GB USB drive used for work-related purposes. An employee \n took it home to do work after hours, and left it in the car. The employee \n forgot to lock the car doors. The USB drive was stolen along with \n a personal hand-held device.
## 3 A laptop computer from a Halifax \n Health employee's vehicle in Orange County was stolen -- which might \n have contained password protected patient information.
## 4 Suffolk Community College has \n agreed to pay a company for the next year to monitor the credit of \n 300 students whose last names and Social Security numbers were mistakenly \n listed in an attachment to an e-mail sent to those students last month.
## 5 Officials at Penrose Hospital believe someone has stolen the personal information of 175 patients. The missing information consists of names, addresses, phone numbers, Social Security numbers and the reason for the patients' visits. The information was stored on a computer print-out and kept in a binder stored in a cabinet. The print out has gone missing.
## 6 The names and Social Security \n numbers of about 5,000 Eastern Kentucky University faculty, staff \n and student workers were posted inadvertently on the Internet last \n September, where they have been displayed for a year.
## Information.Source Source.URL Year.of.Breach Latitude Longitude
## 1 Dataloss DB 2009 37.98840 -85.71579
## 2 Dataloss DB 2009 35.93396 -84.55244
## 3 Dataloss DB 2009 29.21082 -81.02283
## 4 Dataloss DB 2009 40.86649 -73.03566
## 5 Dataloss DB 2009 38.83388 -104.82136
## 6 Dataloss DB 2009 37.74786 -84.29465
## breach_type_gA breach_type_gB breach_type_gC breach_type_gD region_A
## 1 Others Insider Others Network South
## 2 Others Mixed Offline Local South
## 3 Others Mixed Offline Local South
## 4 Others Insider Others Network Northeast
## 5 Others Mixed Offline Local West
## 6 Others Insider Others Network South
## region_B age_of_Law org_group records_group records_index
## 1 Southeast 0 F_regulated Others 0.0008129202
## 2 Southeast 14 F_regulated Others 0.0177772189
## 3 Southeast 14 F_regulated Others 0.0396839765
## 4 Northeast 14 F_regulated Others 0.0003607634
## 5 West 13 F_regulated Others 0.0002104453
## 6 Southeast 0 F_regulated Others 0.0060668382
## records_scale type_scale org_scale severity_index1 severity_index2
## 1 10 1 8 11 18
## 2 10 7 8 17 18
## 3 10 7 10 17 20
## 4 10 1 8 11 18
## 5 10 7 10 17 20
## 6 10 1 8 11 18
## severity_index3 severity_index4 severity_index5
## 1 19 6.333333 22.333333
## 2 25 8.333333 2.333333
## 3 27 9.000000 3.000000
## 4 19 6.333333 22.333333
## 5 27 9.000000 3.000000
## 6 19 6.333333 22.333333
# Pull out the incidents flagged 'Extraordinary' in records_group
# and summarise every column of that subset
df_ext <- df[which(df$records_group == 'Extraordinary'), ]
summary(object = df_ext)
## Date.Made.Public
## 2014-08-05:1
## 2016-12-14:1
## 2017-03-08:1
## 2006-01-01:0
## 2006-01-12:0
## 2006-01-16:0
## (Other) :0
## Company
## River City Media :1
## Russian hacking discovered by Hold Security :1
## Yahoo :1
## Spring Independent School District (Spring, TX):0
## Harvard University :0
## Penn State University :0
## (Other) :0
## State Type.of.breach Type.of.organization Total.Records
## California:1 CARD:0 BSF:0 Min. :1.000e+09
## Oregon :1 DISC:1 BSO:3 1st Qu.:1.185e+09
## Wisconsin :1 HACK:2 BSR:0 Median :1.370e+09
## Alabama :0 INSD:0 EDU:0 Mean :1.790e+09
## Alaska :0 PHYS:0 GOV:0 3rd Qu.:2.185e+09
## Arizona :0 PORT:0 MED:0 Max. :3.000e+09
## (Other) :0 STAT:0 NGO:0
## Description.of.incident
## "A gang of Russian hackers has amassed over 1 billion username and \npassword combinations and more than 500 million email addresses, a \nsecurity firm reported late Tuesday, calling it the largest-ever haul of\n stolen Internet credentials.The massive trove — stolen from \nhundreds of thousands of websites — was discovered by the Milwaukee firm\n Hold Security, according to a post on its website".According to reports by Hold Security, it took over seven months to identify the gang, "whom the firm dubbed CyberVor, or \ncyber-thief in Russian".\n \nIt appears that no payment card information or Social Security numbers were threatened.PRC will provide updates as the story unfolds.  *note: state location provided is that of Hold Security LLC. :1
## "One of the world's allegedly most prolific spamming operations inadvertently left backup databases accessible online, exposing upwards of 1.37 billion records and a raft of internal company information.Chris Vickery, a security researcher who works for the anti-virus company MacKeeper, discovered the databases, which belong to a US-based email and SMS marketing company called River City Media. In some cases, the records include the names, IP addresses, zip codes and physical addresses associated with the email addresses.The cause of the data exposure appears to be an oversight. The company used the rsync protocol to backup its MySQL databases. But those backup servers were not password-protected, Vickery says in an email to Information Security Media Group.The leak could be one of the largest of all time, but it's likely the databases contain duplicates. The databases, which were exposed for at least three months, have since been taken offline. It's unclear if other fraudsters or hackers may have already stumbled upon it. Some of records were updated as recently as January."If the databases were to be released in the wild, the damage would be astounding," Vickery says. "Abusive ex-boyfriends and stalkers everywhere would have a fresh new source of information on victims. You wouldn't feel the damage all at once, but society would indeed suffer over time."Based on preliminarily checks, at least some of the exposed data is legitimate, Vickery writes in a blog post."Investigating names from the list, through social media and work websites, usually shows that the additional details in the entry are most likely accurate," Vickery writes."More Information: http://www.databreachtoday.com/backup-error-exposes-137-billion-record-s... :1
## "Yahoo Inc (YHOO.O) warned on Wednesday that it had uncovered yet another massive cyber attack, saying data from more than 1 billion user accounts was compromised in August 2013, making it the largest breach in history.The number of affected accounts was double the number implicated in 2014 breach that the internet company disclosed in September and blamed on hackers working on behalf of a government.Yahoo required all of its customers to reset their passwords - a stronger measure than it took after the previous breach was discovered, when it only recommended a password reset. Yahoo also said Wednesday that it believes hackers responsible for the previous breach had also accessed the company’s proprietary code to learn how to forge "cookies" that would allow hackers to access an account without a password."Yahoo badly screwed up," said Bruce Schneier, a cryptologist and one of the world's most respected security experts. "They weren't taking security seriously and that's now very clear. 
I would have trouble trusting Yahoo going forward."Yahoo was tentative in its description of new problems, saying the incident was "likely" distinct from the one it reported in September and that stolen information "may have included" names, e-mail addresses, telephone numbers, dates of birth, hashed passwords and, in some cases, encrypted or unencrypted security questions and answers."More information: http://www.reuters.com/article/us-yahoo-cyber-idUSKBN1432WZYahoo statement: https//yahoo.com/security-updateUPDATE (2/15/2017):"Yahoo's newly issued warning to users about malicious hacks is related to a third data breach that the company disclosed in December 2016.A warning sent to some Yahoo users Wednesday read: "Based on the ongoing investigation, we believe a forged cookie may have been used in 2015 or 2016 to access your account."This breach was previously revealed in a December 2016 statement from Yahoo that also provided information on a separate hack that occurred in August 2013 involving more than 1 billion accounts. In addition, some of the 2015 and 2016 incidents have been tied to a "state-sponsored actor" that was involved in a different 2014 breach that affected up to 500 million accounts."Forged cookies" are digital keys that allow access to information without re-entering passwords. The leaked data included email addresses, birth dates and answers to security questions. Yahoo declined to say how many people were affected."More information: http://www.cnbc.com/2017/02/15/yahoo-sends-new-warning-to-customers-abou...UPDATE (3/15/2017): The U.S. Justice Department today unsealed indictments against four men accused of hacking into half-billion Yahoo email accounts. 
Two of the men named in the indictments worked for a unit of the Russian Federal Security Services (FSB) that serves as the FBI's point of contact in Moscow on cybercrime cases."More Information: http://krebsonsecurity.com/UPDATE (9/7/2017): Link to Yahoo judgement: https://www.documentcloud.org/documents/3986196-Yahoo-judgement-on-data-...UPDATE (10/3/2017): "Yahoo has tripled down on what was already the largest data breach in history, saying it affected all 3 billion accounts on its service, not the 1 billion it revealed late last year.The company announced Tuesday that it's providing notice to additional user accounts affected by the August 2013 data theft."More Information: http://hosted.ap.org/dynamic/stories/U/US_YAHOO_DATA_BREACH?SITE=AP&SECT...    :1
## :0
## \t \t \t \t \t"NEW YORK -- Millions of records from a commercial corporate database have been leaked. \t \t \t \t \tThe database, about 52 gigabytes in size, contains just under 33.7 million unique email addresses and other contact information from employees of thousands of companies, representing a large portion of the US corporate population. \t \t \t \t \tDun & Bradstreet, a business services giant, confirmed that it owns the database, which it acquired as part \tof a 2015 deal to buy NetProspex for $125 million.The purchased database contains dozens of fields, some including personal information such as names, job titles and functions, work email addresses, and phone numbers. \t \t \t \t \tOther information includes more generic corporate and publicly sourced data, such as believed office location, the number of employees in the business unit, and other descriptions of the kind of industry the company falls into, such as advertising, legal, media and broadcasting, and telecoms. \t \t \t \t \tThis entire database is used for marketers who want to directly target their own email campaigns and through other communications methods for current and prospective customers. \t \t \t \t \tThe data can be bought either in bulk, or by type of record by companies, but it's not known exactly how much the going rate is for a full data set of this size. We understand from \ta 2015 brochure that the cost of accessing a half-million records can cost some firms up to $200,000. \t \t \t \t \tTroy Hunt, who runs breach notification site \tHave I Been Pwned, obtained the database and analyzed the records. \t \t \t \t \tIn \ta blog post Tuesday, Hunt said the breakdown was entirely US-focused, with California as the most represented demographic with over four million records, then New York with 2.7 million records and Texas with 2.6 million records. \t \t \t \t \tHunt's analysis of the records showed that the leading organization by records is the Dept. 
of Defense, with 101,013 employee records, followed closely by the US Postal Service with 88,153 employee records. \t \t \t \t \tThe US Army, Air Force, and Dept. of Veterans Affairs are all listed with a combined 76,379 records. \t \t \t \t \tAT&T, Boeing, Dell, FedEx, IBM, and Xerox were among the most named companies in the database, with tens of thousands of employee records each. \t \t \t \t \t"Whilst you could piece together parts of the data from information already in the public domain, having it aggregated and so easily searchable in this fashion is enormously valuable," said Hunt in an email on Tuesday. "It also serves as a reminder that we've lost control of our privacy; the vast majority of people in the data set would have no idea their information is being sold in this fashion and they certainly don't have any control over it." \t \t \t \t \tHunt ran the exposed database through \tHave I Been Pwned's database of breached records, which showed 14 percent of email addresses already existed in his database. \t \t \t \t \tThe data is now searchable in \tHave I Been Pwned. \t \t \t \t \tBut it's not known exactly how the data was exposed, or who is to blame for the leak. \t \t \t \t \tA spokesperson for Dun & Bradshaw would not talk on the record beyond an emailed statement, sent prior to publication. \t \t \t \t \t"We've carefully evaluated the information that was shared with us and it is of a type and in a format that we deliver to customers every day. Based on our analysis, it was not accessed or exposed through a Dun & Bradstreet system," the statement read."More Information: http://www.zdnet.com/article/millions-of-records-leaked-from-huge-corpor...:0
## As\n reported by Health and Human Services unauthorized access/disclosure. No specific information as\n to what information was \ncompromised as provided by health and human services. More Information: https://ocrportal.hhs.gov/ocr/breach/breach_report.jsf;jsessionid=9BF4AF... :0
## (Other) :0
## Information.Source
## Media :3
## :0
## California Attorney General:0
## Databreaches.net :0
## Dataloss DB :0
## Government Agency :0
## (Other) :0
## Source.URL
## :3
## http://6abc.com/student-loan-data-breach-affects-16500-borrowers/3402556/ :0
## http://abc30.com/fresno-state-data-breach-exposes-personal-information-of-15000-people/3182146/ :0
## http://abc7.com/technology/30k-ucla-students-warned-about-potential-security-breach/2279390/ :0
## http://agportal-s3bucket.s3.amazonaws.com/uploadedfiles/Another/Supporting_Law_Enforcement/MultnomahAthleticClub.2018-01-10.pdf:0
## http://enewspaper.latimes.com/infinity/article_popover_share.aspx?guid=0511a587-c9aa-4ea2-a331-64f54856baeb :0
## (Other) :0
## Year.of.Breach Latitude Longitude breach_type_gA
## Min. :2014 Min. :37.37 Min. :-122.68 Hacker:2
## 1st Qu.:2015 1st Qu.:39.63 1st Qu.:-122.36 Others:1
## Median :2016 Median :41.90 Median :-122.04
## Mean :2016 Mean :41.60 Mean :-110.89
## 3rd Qu.:2016 3rd Qu.:43.71 3rd Qu.:-104.99
## Max. :2017 Max. :45.52 Max. : -87.95
##
## breach_type_gB breach_type_gC breach_type_gD region_A region_B
## Insider :1 Offline:0 Local :0 Midwest :1 Midwest :1
## Mixed :0 Online :2 Network:3 Non_US :0 Non_US :0
## Outsider:2 Others :1 Others :0 Northeast:0 Northeast:0
## South :0 Southeast:0
## West :2 Southwest:0
## West :2
##
## age_of_Law org_group records_group records_index
## 12 :1 F_regulated :0 Extraordinary:3 Min. :0.5587
## 13 :1 non_F_regulated:3 Others :0 1st Qu.:0.6620
## 16 :1 Median :0.7654
## 0 :0 Mean :1.0000
## 10 :0 3rd Qu.:1.2207
## 11 :0 Max. :1.6760
## (Other):0
## records_scale type_scale org_scale severity_index1 severity_index2
## Min. :10 Min. : 1.0 Min. :6 Min. :11.0 Min. :16
## 1st Qu.:10 1st Qu.: 5.5 1st Qu.:6 1st Qu.:15.5 1st Qu.:16
## Median :10 Median :10.0 Median :6 Median :20.0 Median :16
## Mean :10 Mean : 7.0 Mean :6 Mean :17.0 Mean :16
## 3rd Qu.:10 3rd Qu.:10.0 3rd Qu.:6 3rd Qu.:20.0 3rd Qu.:16
## Max. :10 Max. :10.0 Max. :6 Max. :20.0 Max. :16
##
## severity_index3 severity_index4 severity_index5
## Min. :17.0 Min. :5.667 Min. : 5.333
## 1st Qu.:21.5 1st Qu.:7.167 1st Qu.: 5.333
## Median :26.0 Median :8.667 Median : 5.333
## Mean :23.0 Mean :7.667 Mean :10.333
## 3rd Qu.:26.0 3rd Qu.:8.667 3rd Qu.:12.833
## Max. :26.0 Max. :8.667 Max. :20.333
##
# Exclude the extraordinary incidents from the working dataset.
# The 'Extraordinary' flag is a level of records_group; region_A only holds
# geographic regions, so filtering on region_A removed no rows at all.
# NOTE: the recorded output further down (5355 rows, 'Extraordinary: 3' in
# summary) was produced with the old region_A filter and predates this fix.
df <- subset(df, records_group != 'Extraordinary')
head(df)
## Date.Made.Public Company State Type.of.breach
## 1 2009-10-21 Bullitt County Public Schools Kentucky DISC
## 2 2009-10-21 Roane State Community College Tennessee PORT
## 3 2009-10-15 Halifax Health Florida PORT
## 4 2009-10-04 Suffolk Community College New York DISC
## 5 2009-09-28 Penrose Hospital Colorado PHYS
## 6 2009-09-23 Eastern Kentucky University Kentucky DISC
## Type.of.organization Total.Records
## 1 EDU 676
## 2 EDU 14783
## 3 MED 33000
## 4 EDU 300
## 5 MED 175
## 6 EDU 5045
## Description.of.incident
## 1 A Bullitt County Public Schools \n employee accidentally sent an e-mail message to about 1,800 school \n district workers that included the names and Social Security numbers \n of 676 district employees. The employees were identified as not having \n completed the district's 2010 open-enrollment process for insurance, \n and the e-mail was intended as a reminder to complete the process.
## 2 Roane State Community College \n has announced that the names and Social Security numbers of 9,747 \n current or former students were on a data storage device stolen from \n an employee's vehicle, along with 1,194 current/former employees' \n information. The Social Security numbers alone, with no names, were \n also stolen for 5,036 additional current or former students. The data \n was on a 4GB USB drive used for work-related purposes. An employee \n took it home to do work after hours, and left it in the car. The employee \n forgot to lock the car doors. The USB drive was stolen along with \n a personal hand-held device.
## 3 A laptop computer from a Halifax \n Health employee's vehicle in Orange County was stolen -- which might \n have contained password protected patient information.
## 4 Suffolk Community College has \n agreed to pay a company for the next year to monitor the credit of \n 300 students whose last names and Social Security numbers were mistakenly \n listed in an attachment to an e-mail sent to those students last month.
## 5 Officials at Penrose Hospital believe someone has stolen the personal information of 175 patients. The missing information consists of names, addresses, phone numbers, Social Security numbers and the reason for the patients' visits. The information was stored on a computer print-out and kept in a binder stored in a cabinet. The print out has gone missing.
## 6 The names and Social Security \n numbers of about 5,000 Eastern Kentucky University faculty, staff \n and student workers were posted inadvertently on the Internet last \n September, where they have been displayed for a year.
## Information.Source Source.URL Year.of.Breach Latitude Longitude
## 1 Dataloss DB 2009 37.98840 -85.71579
## 2 Dataloss DB 2009 35.93396 -84.55244
## 3 Dataloss DB 2009 29.21082 -81.02283
## 4 Dataloss DB 2009 40.86649 -73.03566
## 5 Dataloss DB 2009 38.83388 -104.82136
## 6 Dataloss DB 2009 37.74786 -84.29465
## breach_type_gA breach_type_gB breach_type_gC breach_type_gD region_A
## 1 Others Insider Others Network South
## 2 Others Mixed Offline Local South
## 3 Others Mixed Offline Local South
## 4 Others Insider Others Network Northeast
## 5 Others Mixed Offline Local West
## 6 Others Insider Others Network South
## region_B age_of_Law org_group records_group records_index
## 1 Southeast 0 F_regulated Others 0.0008129202
## 2 Southeast 14 F_regulated Others 0.0177772189
## 3 Southeast 14 F_regulated Others 0.0396839765
## 4 Northeast 14 F_regulated Others 0.0003607634
## 5 West 13 F_regulated Others 0.0002104453
## 6 Southeast 0 F_regulated Others 0.0060668382
## records_scale type_scale org_scale severity_index1 severity_index2
## 1 10 1 8 11 18
## 2 10 7 8 17 18
## 3 10 7 10 17 20
## 4 10 1 8 11 18
## 5 10 7 10 17 20
## 6 10 1 8 11 18
## severity_index3 severity_index4 severity_index5
## 1 19 6.333333 22.333333
## 2 25 8.333333 2.333333
## 3 27 9.000000 3.000000
## 4 19 6.333333 22.333333
## 5 27 9.000000 3.000000
## 6 19 6.333333 22.333333
# Inspect the dataset: dimensions first, then the column names
dim(df)
## [1] 5355 30
colnames(df)
## [1] "Date.Made.Public" "Company"
## [3] "State" "Type.of.breach"
## [5] "Type.of.organization" "Total.Records"
## [7] "Description.of.incident" "Information.Source"
## [9] "Source.URL" "Year.of.Breach"
## [11] "Latitude" "Longitude"
## [13] "breach_type_gA" "breach_type_gB"
## [15] "breach_type_gC" "breach_type_gD"
## [17] "region_A" "region_B"
## [19] "age_of_Law" "org_group"
## [21] "records_group" "records_index"
## [23] "records_scale" "type_scale"
## [25] "org_scale" "severity_index1"
## [27] "severity_index2" "severity_index3"
## [29] "severity_index4" "severity_index5"
# Compact per-column overview: storage type plus the first few values
str(object = df)
## 'data.frame': 5355 obs. of 30 variables:
## $ Date.Made.Public : Factor w/ 2548 levels "2006-01-01","2006-01-12",..: 692 692 690 686 684 682 681 681 680 678 ...
## $ Company : Factor w/ 4732 levels " Spring Independent School District (Spring, TX)",..: 602 3348 1663 3780 3080 1267 453 3407 3366 4246 ...
## $ State : Factor w/ 61 levels "Alabama","Alaska",..: 23 53 13 39 9 23 54 8 61 13 ...
## $ Type.of.breach : Factor w/ 7 levels "CARD","DISC",..: 2 6 6 2 5 2 6 5 2 2 ...
## $ Type.of.organization : Factor w/ 7 levels "BSF","BSO","BSR",..: 4 4 6 4 6 4 1 6 1 4 ...
## $ Total.Records : num 676 14783 33000 300 175 ...
## $ Description.of.incident: Factor w/ 4371 levels ""," \t \t \t \t \t\"NEW YORK -- Millions of records from a commercial corporate database have been leaked. \t \t \"| __truncated__,..: 331 3353 918 3515 2871 3929 2639 4157 520 2453 ...
## $ Information.Source : Factor w/ 16 levels "","California Attorney General",..: 4 4 4 4 4 4 4 4 4 4 ...
## $ Source.URL : Factor w/ 95 levels "","http://6abc.com/student-loan-data-breach-affects-16500-borrowers/3402556/",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ Year.of.Breach : int 2009 2009 2009 2009 2009 2009 2009 2009 2009 2009 ...
## $ Latitude : num 38 35.9 29.2 40.9 38.8 ...
## $ Longitude : num -85.7 -84.6 -81 -73 -104.8 ...
## $ breach_type_gA : Factor w/ 2 levels "Hacker","Others": 2 2 2 2 2 2 2 2 2 2 ...
## $ breach_type_gB : Factor w/ 3 levels "Insider","Mixed",..: 1 2 2 1 2 1 2 2 1 1 ...
## $ breach_type_gC : Factor w/ 3 levels "Offline","Online",..: 3 1 1 3 1 3 1 1 3 3 ...
## $ breach_type_gD : Factor w/ 3 levels "Local","Network",..: 2 1 1 2 1 2 1 1 2 2 ...
## $ region_A : Factor w/ 5 levels "Midwest","Non_US",..: 4 4 4 3 5 4 4 5 5 4 ...
## $ region_B : Factor w/ 6 levels "Midwest","Non_US",..: 4 4 4 3 6 4 5 6 6 4 ...
## $ age_of_Law : Factor w/ 18 levels "0","10","11",..: 1 6 6 6 5 1 2 7 4 6 ...
## $ org_group : Factor w/ 2 levels "F_regulated",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ records_group : Factor w/ 2 levels "Extraordinary",..: 2 2 2 2 2 2 2 2 2 2 ...
## $ records_index : num 0.000813 0.017777 0.039684 0.000361 0.00021 ...
## $ records_scale : num 10 10 10 10 10 ...
## $ type_scale : int 1 7 7 1 7 1 7 7 1 1 ...
## $ org_scale : int 8 8 10 8 10 8 9 10 9 8 ...
## $ severity_index1 : num 11 17 17 11 17 ...
## $ severity_index2 : num 18 18 20 18 20 ...
## $ severity_index3 : num 19 25 27 19 27 ...
## $ severity_index4 : num 6.33 8.33 9 6.33 9 ...
## $ severity_index5 : num 22.33 2.33 3 22.33 3 ...
The dataset has 5355 rows and 30 columns. The data type of age_of_Law is Factor; it needs to be converted to a numeric type.
# age_of_Law was read in as a factor. Convert it to numeric by going through
# character, so the level labels (not the internal level codes) become the values.
df[["age_of_Law"]] <- as.numeric(as.character(df[["age_of_Law"]]))
class(df[["age_of_Law"]])
## [1] "numeric"
# Per-column summary statistics for the whole dataset
summary(object = df)
## Date.Made.Public Company State
## 2016-02-26: 18 University of Florida : 10 California: 657
## 2013-11-08: 15 Experian : 7 New York : 407
## 2011-12-09: 14 Henry Ford Health System : 7 Texas : 398
## 2012-08-03: 12 Mount Sinai Medical Center: 7 Florida : 320
## 2013-08-28: 10 Private Medical Practice : 7 Maryland : 272
## 2012-07-17: 9 Walgreen Co. : 7 Illinois : 226
## (Other) :5277 (Other) :5310 (Other) :3075
## Type.of.breach Type.of.organization Total.Records
## CARD: 32 BSF: 321 Min. :1.000e+00
## DISC:1268 BSO: 382 1st Qu.:6.520e+02
## HACK:1344 BSR: 250 Median :2.006e+03
## INSD: 357 EDU: 565 Mean :1.811e+06
## PHYS:1362 GOV: 465 3rd Qu.:1.018e+04
## PORT: 819 MED:3310 Max. :3.000e+09
## STAT: 173 NGO: 62
## Description.of.incident
## Location of breached information: Network Server\nBusiness associate present: No\n : 147
## Location of breached information: Email\nBusiness associate present: No\n : 107
## Location of breached information: Paper/Films\nBusiness associate present: No\n : 90
## \\N\nLocation of breached information: Laptop\nBusiness associate present: No\n : 80
## \\N\nLocation of breached information: Paper/Films\nBusiness associate present: No\n : 67
## \\N\nLocation of breached information: Desktop Computer\nBusiness associate present: No\n: 48
## (Other) :4816
## Information.Source
## US Department of Health and Human Services:1946
## Dataloss DB :1213
## Media : 564
## Databreaches.net : 445
## PHIPrivacy.net : 370
## Government Agency : 238
## (Other) : 579
## Source.URL
## :3125
## https://ocrportal.hhs.gov/ocr/breach/breach_report.jsf :1947
## http://www.marylandattorneygeneral.gov/Pages/IdentityTheft/breachnotices.aspx : 157
## http://www.marylandattorneygeneral.gov/Pages/IdentityTheft/breachnotices.aspx?subfolder=2015 : 36
## http://www.healthcareitnews.com/news/hackers-breach-new-yorks-largest-provider-phishing-attacks : 2
## http://www.healthcareitnews.com/news/surgery-center-says-34000-patient-records-potentially-breached: 2
## (Other) : 86
## Year.of.Breach Latitude Longitude breach_type_gA
## Min. :2006 Min. :-34.60 Min. :-158.06 Hacker:1344
## 1st Qu.:2010 1st Qu.: 36.78 1st Qu.: -95.71 Others:4011
## Median :2012 Median : 40.71 Median : -81.03
## Mean :2012 Mean : 38.66 Mean : -86.96
## 3rd Qu.:2015 3rd Qu.: 40.76 3rd Qu.: -73.98
## Max. :2017 Max. : 64.84 Max. : 0.00
##
## breach_type_gB breach_type_gC breach_type_gD region_A
## Insider :1625 Offline:2181 Local :2354 Midwest :1086
## Mixed :2354 Online :1376 Network:2644 Non_US : 0
## Outsider:1376 Others :1798 Others : 357 Northeast:1030
## South :1956
## West :1283
##
##
## region_B age_of_Law org_group
## Midwest :1086 Min. : 0.00 F_regulated :4196
## Non_US : 0 1st Qu.:12.00 non_F_regulated:1159
## Northeast:1315 Median :13.00
## Southeast:1226 Mean :12.55
## Southwest: 579 3rd Qu.:14.00
## West :1149 Max. :16.00
##
## records_group records_index records_scale type_scale
## Extraordinary: 3 Min. : 0.0000 Min. : 7.311 Min. : 1.00
## Others :5352 1st Qu.: 0.0008 1st Qu.:10.000 1st Qu.: 7.00
## Median : 0.0024 Median :10.000 Median : 7.00
## Mean : 0.9722 Mean : 9.972 Mean : 6.35
## 3rd Qu.: 0.0122 3rd Qu.:10.000 3rd Qu.:10.00
## Max. :601.2724 Max. :10.000 Max. :10.00
##
## org_scale severity_index1 severity_index2 severity_index3
## Min. : 1.000 Min. : 8.311 Min. :10.31 Min. :12.00
## 1st Qu.: 8.000 1st Qu.:17.000 1st Qu.:18.00 1st Qu.:21.00
## Median :10.000 Median :17.000 Median :20.00 Median :27.00
## Mean : 8.231 Mean :16.323 Mean :18.20 Mean :24.55
## 3rd Qu.:10.000 3rd Qu.:19.999 3rd Qu.:20.00 3rd Qu.:27.00
## Max. :10.000 Max. :20.000 Max. :20.00 Max. :30.00
##
## severity_index4 severity_index5
## Min. : 4.000 Min. : 0.000
## 1st Qu.: 7.000 1st Qu.: 2.883
## Median : 9.000 Median : 3.000
## Mean : 8.185 Mean : 9.816
## 3rd Qu.: 9.000 3rd Qu.:21.000
## Max. :10.000 Max. :27.000
##
# Share of all incidents that falls under each breach type
ggplot(data = df, mapping = aes(x = Type.of.breach)) +
  geom_bar(aes(y = ..count.. / sum(..count..))) +
  scale_y_continuous(labels = scales::percent) +
  labs(y = "Percentage")
# Chi-square test of the breach-type counts against equal probabilities
chisq.test(table(df$Type.of.breach))
##
## Chi-squared test for given probabilities
##
## data: table(df$Type.of.breach)
## X-squared = 2616.7, df = 6, p-value < 2.2e-16
‘PHYS’, ‘DISC’ and ‘HACK’ (3 of the 7 breach types present in the data, as the test's 6 degrees of freedom show) account for about 70% of the breaches.
# Proportion of incidents in each level of breach_type_gB
ggplot(data = df) +
  geom_bar(aes(x = breach_type_gB, y = ..prop.., group = 1))
# Chi-square test of the group-B counts against equal probabilities
chisq.test(table(df$breach_type_gB))
##
## Chi-squared test for given probabilities
##
## data: table(df$breach_type_gB)
## X-squared = 289.44, df = 2, p-value < 2.2e-16
# Proportion of incidents in each level of breach_type_gC
ggplot(data = df) +
  geom_bar(aes(x = breach_type_gC, y = ..prop.., group = 1))
# Chi-square test of the group-C counts against equal probabilities
chisq.test(table(df$breach_type_gC))
##
## Chi-squared test for given probabilities
##
## data: table(df$breach_type_gC)
## X-squared = 181.66, df = 2, p-value < 2.2e-16
# Proportion of incidents in each level of breach_type_gD
ggplot(data = df) +
  geom_bar(aes(x = breach_type_gD, y = ..prop.., group = 1))
# Chi-square test of the group-D counts against equal probabilities
chisq.test(table(df$breach_type_gD))
##
## Chi-squared test for given probabilities
##
## data: table(df$breach_type_gD)
## X-squared = 1737.2, df = 2, p-value < 2.2e-16
# Share of all incidents contributed by each organization type
ggplot(data = df, mapping = aes(x = Type.of.organization)) +
  geom_bar(aes(y = ..count.. / sum(..count..))) +
  scale_y_continuous(labels = scales::percent) +
  labs(y = "Percentage")
# Chi-square test of the organization-type counts against equal probabilities
chisq.test(table(df$Type.of.organization))
##
## Chi-squared test for given probabilities
##
## data: table(df$Type.of.organization)
## X-squared = 10079, df = 6, p-value < 2.2e-16
More than 60% of the breaches are from medical organizations.
# Proportion of incidents in each level of org_group
ggplot(data = df) +
  geom_bar(aes(x = org_group, y = ..prop.., group = 1))
# Chi-square test of the org_group counts against equal probabilities
chisq.test(table(df$org_group))
##
## Chi-squared test for given probabilities
##
## data: table(df$org_group)
## X-squared = 1722.4, df = 1, p-value < 2.2e-16
# Five-number summary (plus mean) of records exposed per incident
summary(df[["Total.Records"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.000e+00 6.520e+02 2.006e+03 1.811e+06 1.018e+04 3.000e+09
# Histograms of records exposed per incident: raw scale (p1) and log10 x axis
# (p2), stacked in one column. Built with ggplot() + geom_histogram(), the same
# form used for the severity-index histograms, instead of the deprecated qplot().
p1 <- ggplot(df, aes(x = Total.Records)) +
  geom_histogram() +
  labs(x = "US Total Records", y = "Count")
p2 <- ggplot(df, aes(x = Total.Records)) +
  geom_histogram() +
  scale_x_log10() +
  labs(x = "US Total Records (log10)", y = "Count")
# p3 <- qplot(x = Total.Records, data = subset(df, State == 'California'), xlab = 'CA Total Records(log10)', ylab = 'Count') +
# scale_x_log10()
grid.arrange(p1, p2, ncol = 1)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
The distribution of total records is heavily right-skewed. After a log10 transformation, it looks approximately normal, with a mean at about 1000.
# Incident counts per year: every record (blue) versus California only (red)
ggplot(mapping = aes(x = Year.of.Breach)) +
  geom_freqpoly(data = df, stat = "count", colour = "blue") +
  geom_freqpoly(
    data = subset(df, State == "California"),
    stat = "count",
    colour = "red"
  ) +
  scale_x_continuous(breaks = seq(2006, 2017, by = 1))
# Chi-square test of the yearly incident counts against equal probabilities
chisq.test(table(df$Year.of.Breach))
##
## Chi-squared test for given probabilities
##
## data: table(df$Year.of.Breach)
## X-squared = 495.62, df = 11, p-value < 2.2e-16
# Print the column names of df for reference; the density plot of incidents by
# year is drawn after this listing
names(df)
## [1] "Date.Made.Public" "Company"
## [3] "State" "Type.of.breach"
## [5] "Type.of.organization" "Total.Records"
## [7] "Description.of.incident" "Information.Source"
## [9] "Source.URL" "Year.of.Breach"
## [11] "Latitude" "Longitude"
## [13] "breach_type_gA" "breach_type_gB"
## [15] "breach_type_gC" "breach_type_gD"
## [17] "region_A" "region_B"
## [19] "age_of_Law" "org_group"
## [21] "records_group" "records_index"
## [23] "records_scale" "type_scale"
## [25] "org_scale" "severity_index1"
## [27] "severity_index2" "severity_index3"
## [29] "severity_index4" "severity_index5"
# Density of incidents over the years: every record (blue) versus California
# only (red)
ggplot(mapping = aes(x = Year.of.Breach)) +
  geom_density(data = df, colour = "blue") +
  geom_density(data = subset(df, State == "California"), colour = "red") +
  scale_x_continuous(breaks = seq(2006, 2017, by = 1))
# Share of all incidents recorded in each region_B level
ggplot(data = df, mapping = aes(x = region_B)) +
  geom_bar(aes(y = ..count.. / sum(..count..))) +
  scale_y_continuous(labels = scales::percent) +
  labs(y = "Percentage")
# Chi-square test of the regional counts against equal probabilities
chisq.test(table(df$region_B))
##
## Chi-squared test for given probabilities
##
## data: table(df$region_B)
## X-squared = 1442.9, df = 5, p-value < 2.2e-16
The Southwest, which includes ‘Texas’, ‘Oklahoma’, ‘New Mexico’ and ‘Arizona’, has relatively fewer data breach events than the other regions.
# Histogram of age_of_Law across incidents. Uses ggplot() + geom_histogram(),
# matching the severity-index histograms, instead of the deprecated qplot().
ggplot(df, aes(x = age_of_Law)) +
  geom_histogram() +
  labs(x = "Age of Law", y = "Count")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Histogram of severity_index1 (default binning)
ggplot(data = df, mapping = aes(severity_index1)) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
This distribution of severity index 1 is a little left skewed. The peak appears at 17.
# Histogram of severity_index2 (default binning)
ggplot(data = df, mapping = aes(severity_index2)) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
The distribution of severity index 2 is heavily left-skewed. The peak appears at 20.
# Histogram of severity_index3 (default binning)
ggplot(data = df, mapping = aes(severity_index3)) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Histogram of severity_index4 (default binning)
ggplot(data = df, mapping = aes(severity_index4)) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Histogram of severity_index5 (default binning)
ggplot(data = df, mapping = aes(severity_index5)) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Incidents per year, one line per breach type
ggplot(
  count(df, Year.of.Breach, Type.of.breach),
  aes(x = Year.of.Breach, y = n, colour = Type.of.breach, group = Type.of.breach)
) +
  geom_line() +
  scale_x_continuous(breaks = seq(2006, 2017, by = 2)) +
  labs(x = "Year of breach", y = "Count")
All types show some fluctuations. Breaches of type ‘HACK’ and ‘DISC’ are increasing over the years and account for more incidents, while the other types are decreasing and account for fewer incidents. Organizations that are more vulnerable to ‘HACK’ and ‘DISC’ need to pay more attention. In the following part, we will explore which organizations are more vulnerable to ‘HACK’ and ‘DISC’.
# Incidents per year, one line per breach_type_gB level
ggplot(
  count(df, Year.of.Breach, breach_type_gB),
  aes(x = Year.of.Breach, y = n, colour = breach_type_gB, group = breach_type_gB)
) +
  geom_line() +
  scale_x_continuous(breaks = seq(2006, 2017, by = 2)) +
  labs(x = "Year of breach", y = "Count")
Breaches caused by outsiders have been increasing over these years, while breaches caused by insiders have not increased much since 2013.
# Yearly gap between outsider- and insider-caused incidents.
# Keep only the Insider/Outsider rows, count incidents per year and group, then
# reshape to one row per year with columns n.Insider and n.Outsider.
io_df <- df %>%
  subset(breach_type_gB %in% c("Insider", "Outsider")) %>%
  count(Year.of.Breach, breach_type_gB) %>%
  data.frame() %>%
  reshape(idvar = "Year.of.Breach", timevar = "breach_type_gB", direction = "wide")
# Positive values mean more outsider than insider incidents in that year
io_df$diff <- io_df$n.Outsider - io_df$n.Insider
# The y axis shows a difference of counts, so it is labelled as such rather
# than as a plain count
ggplot(data = io_df, aes(x = Year.of.Breach, y = diff)) +
  geom_line() +
  scale_x_continuous(breaks = seq(2006, 2017, 2)) +
  xlab("Year of breach") +
  ylab("Outsider minus insider incidents")
# Incidents per year, one line per breach_type_gC level
ggplot(
  count(df, Year.of.Breach, breach_type_gC),
  aes(x = Year.of.Breach, y = n, colour = breach_type_gC, group = breach_type_gC)
) +
  geom_line() +
  scale_x_continuous(breaks = seq(2006, 2017, by = 2)) +
  labs(x = "Year of breach", y = "Count")
Offline breaches have been decreasing dramatically since 2010, while online breaches have been increasing.
# Incidents per year, one line per breach_type_gD level
ggplot(
  count(df, Year.of.Breach, breach_type_gD),
  aes(x = Year.of.Breach, y = n, colour = breach_type_gD, group = breach_type_gD)
) +
  geom_line() +
  scale_x_continuous(breaks = seq(2006, 2017, by = 2)) +
  labs(x = "Year of breach", y = "Count")
Breaches from network have been increasing while breaches from local have been decreasing.
# Organization-type mix among incidents of type HACK or DISC
ggplot(
  subset(df, Type.of.breach %in% c("HACK", "DISC")),
  aes(x = Type.of.organization)
) +
  geom_bar(aes(y = ..count.. / sum(..count..))) +
  scale_y_continuous(labels = scales::percent) +
  labs(x = "Type of organization", y = "Percentage")
# Organization-type mix among HACK incidents only
ggplot(subset(df, Type.of.breach == "HACK"), aes(x = Type.of.organization)) +
  geom_bar(aes(y = ..count.. / sum(..count..))) +
  scale_y_continuous(labels = scales::percent) +
  labs(x = "Type of organization", y = "Percentage")
# Organization-type mix among DISC incidents only
ggplot(subset(df, Type.of.breach == "DISC"), aes(x = Type.of.organization)) +
  geom_bar(aes(y = ..count.. / sum(..count..))) +
  scale_y_continuous(labels = scales::percent) +
  labs(x = "Type of organization", y = "Percentage")
# All incidents by organization type, bars stacked by breach type
ggplot(df, aes(x = Type.of.organization, fill = Type.of.breach)) +
  geom_bar(aes(y = ..count.. / sum(..count..))) +
  scale_y_continuous(labels = scales::percent) +
  labs(x = "Type of organization", y = "Percentage")
More than 50% of the incidents for ‘BSO’, ‘BSR’, ‘EDU’ are of type ‘HACK’ and ‘DISC’.
# Incidents per year, one line per organization type
ggplot(
  count(df, Year.of.Breach, Type.of.organization),
  aes(
    x = Year.of.Breach, y = n,
    colour = Type.of.organization, group = Type.of.organization
  )
) +
  geom_line() +
  scale_x_continuous(breaks = seq(2006, 2017, by = 2)) +
  labs(x = "Year of breach", y = "Count")
Breaches in medical organizations have been increasing dramatically, especially after 2009. Breaches in ‘BSO’ peaked in 2011, decreased through 2015, and started to rebound after that.
So medical and ‘BSO’ need to pay relatively more attention to data breach.
# Incidents per year, one line per org_group level
ggplot(
  count(df, Year.of.Breach, org_group),
  aes(x = Year.of.Breach, y = n, colour = org_group, group = org_group)
) +
  geom_line() +
  scale_x_continuous(breaks = seq(2006, 2017, by = 2)) +
  labs(x = "Year of breach", y = "Count")
Although there are some fluctuations, the overall trend is that breaches in federally regulated organizations are increasing while breaches in non-federally regulated organizations are decreasing.
# Heat map of incident shares: for each organization type, the fraction of its
# incidents that falls under each breach type
df %>%
  group_by(Type.of.organization, Type.of.breach) %>%
  summarise(n = n()) %>%
  # normalise within each organization type
  group_by(Type.of.organization) %>%
  mutate(incidents_indensity = n / sum(n)) %>%
  ggplot(aes(x = Type.of.organization, y = Type.of.breach)) +
  geom_tile(aes(fill = incidents_indensity)) +
  geom_text(aes(label = round(incidents_indensity, 2))) +
  scale_fill_continuous(low = "white", high = "black")
# Chi-square test of independence between organization type and breach type
chisq.test(table(df$Type.of.organization, df$Type.of.breach))
##
## Pearson's Chi-squared test
##
## data: table(df$Type.of.organization, df$Type.of.breach)
## X-squared = 1194.7, df = 36, p-value < 2.2e-16
‘BSF’ is more prone to ‘HACK’ and ‘PORT’. ‘BSO’ and ‘BSR’ are more prone to ‘HACK’. ‘EDU’ is more prone to ‘DISC’ and ‘HACK’. ‘GOV’ is more prone to ‘PORT’. ‘MED’ is more prone to ‘PHYS’. ‘NGO’ is more prone to ‘PORT’ and ‘HACK’.
# Heat map of record shares: for each organization type, the fraction of its
# total exposed records that falls under each breach type
df %>%
  group_by(Type.of.organization, Type.of.breach) %>%
  summarise(total_records = sum(Total.Records)) %>%
  # normalise within each organization type
  group_by(Type.of.organization) %>%
  mutate(records_indensity = total_records / sum(total_records)) %>%
  ggplot(aes(x = Type.of.organization, y = Type.of.breach)) +
  geom_tile(aes(fill = records_indensity)) +
  geom_text(aes(label = round(records_indensity, 2))) +
  scale_fill_continuous(low = "white", high = "black")
# Yearly mean of each severity index, one line chart per index.
# aggregate() names the year column Group.1 and the mean column x.
with(df, aggregate(severity_index1, list(Year.of.Breach), mean)) %>%
  ggplot(aes(x = Group.1, y = x)) +
  geom_line() +
  scale_x_continuous(breaks = seq(2006, 2017, by = 2)) +
  labs(x = "Year of breach", y = "Average severity index 1")
with(df, aggregate(severity_index2, list(Year.of.Breach), mean)) %>%
  ggplot(aes(x = Group.1, y = x)) +
  geom_line() +
  scale_x_continuous(breaks = seq(2006, 2017, by = 2)) +
  labs(x = "Year of breach", y = "Average severity index 2")
with(df, aggregate(severity_index3, list(Year.of.Breach), mean)) %>%
  ggplot(aes(x = Group.1, y = x)) +
  geom_line() +
  scale_x_continuous(breaks = seq(2006, 2017, by = 2)) +
  labs(x = "Year of breach", y = "Average severity index 3")
with(df, aggregate(severity_index4, list(Year.of.Breach), mean)) %>%
  ggplot(aes(x = Group.1, y = x)) +
  geom_line() +
  scale_x_continuous(breaks = seq(2006, 2017, by = 2)) +
  labs(x = "Year of breach", y = "Average severity index 4")
with(df, aggregate(severity_index5, list(Year.of.Breach), mean)) %>%
  ggplot(aes(x = Group.1, y = x)) +
  geom_line() +
  scale_x_continuous(breaks = seq(2006, 2017, by = 2)) +
  labs(x = "Year of breach", y = "Average severity index 5")
# Yearly mean of all five severity indices on one chart.
# The index columns are selected by name rather than by position (26:30), so
# the result does not change if columns are added to or reordered in df.
mean_index <- aggregate(
  df[, paste0("severity_index", 1:5)],
  list(df$Year.of.Breach),
  mean
)
# Long format: one row per (year, index) pair; Group.1 holds the year
melt_mean_index <- melt(mean_index, id.vars = "Group.1")
ggplot(melt_mean_index, aes(Group.1, value, col = variable)) +
  geom_line() +
  scale_x_continuous(breaks = seq(2006, 2017, 2)) +
  xlab("Year of breach") +
  ylab("Average severity index")
# Heat map for severity_index1: for each organization type, the fraction of its
# summed index that falls under each breach type
df %>%
  group_by(Type.of.organization, Type.of.breach) %>%
  summarise(severity_index = sum(severity_index1)) %>%
  # normalise within each organization type
  group_by(Type.of.organization) %>%
  mutate(severity_index_indensity = severity_index / sum(severity_index)) %>%
  ggplot(aes(x = Type.of.organization, y = Type.of.breach)) +
  geom_tile(aes(fill = severity_index_indensity)) +
  geom_text(aes(label = round(severity_index_indensity, 2))) +
  scale_fill_continuous(low = "white", high = "black")
In consideration of severity_index1: - BSF need to pay more attention to PORT and HACK - BSO/BSR/EDU need to pay more attention to HACK - GOV need to pay more attention to PORT and DISC - MED need to pay more attention to PHYS and HACK - NGO need to pay more attention to PORT and HACK
# Heat map for severity_index2: for each organization type, the fraction of its
# summed index that falls under each breach type
df %>%
  group_by(Type.of.organization, Type.of.breach) %>%
  summarise(severity_index = sum(severity_index2)) %>%
  # normalise within each organization type
  group_by(Type.of.organization) %>%
  mutate(severity_index_indensity = severity_index / sum(severity_index)) %>%
  ggplot(aes(x = Type.of.organization, y = Type.of.breach)) +
  geom_tile(aes(fill = severity_index_indensity)) +
  geom_text(aes(label = round(severity_index_indensity, 2))) +
  scale_fill_continuous(low = "white", high = "black")
In terms of severity_index2: - BSF need to pay more attention to PORT and HACK - BSO/BSR need to pay more attention to HACK - EDU need to pay more attention to HACK and DISC (diff) - GOV need to pay more attention to PORT and DISC - MED need to pay more attention to PHYS (diff) - NGO need to pay more attention to PORT and HACK
# Heat map for severity_index3: for each organization type, the fraction of its
# summed index that falls under each breach type
df %>%
  group_by(Type.of.organization, Type.of.breach) %>%
  summarise(severity_index = sum(severity_index3)) %>%
  # normalise within each organization type
  group_by(Type.of.organization) %>%
  mutate(severity_index_indensity = severity_index / sum(severity_index)) %>%
  ggplot(aes(x = Type.of.organization, y = Type.of.breach)) +
  geom_tile(aes(fill = severity_index_indensity)) +
  geom_text(aes(label = round(severity_index_indensity, 2))) +
  scale_fill_continuous(low = "white", high = "black")
In terms of severity_index3: - BSF need to pay more attention to PORT and HACK - BSO/BSR need to pay more attention to HACK - EDU need to pay more attention to HACK and DISC - GOV need to pay more attention to PORT and DISC - MED need to pay more attention to PHYS - NGO need to pay more attention to PORT and HACK
# Heat map for severity_index4: for each organization type, the fraction of its
# summed index that falls under each breach type
df %>%
  group_by(Type.of.organization, Type.of.breach) %>%
  summarise(severity_index = sum(severity_index4)) %>%
  # normalise within each organization type
  group_by(Type.of.organization) %>%
  mutate(severity_index_indensity = severity_index / sum(severity_index)) %>%
  ggplot(aes(x = Type.of.organization, y = Type.of.breach)) +
  geom_tile(aes(fill = severity_index_indensity)) +
  geom_text(aes(label = round(severity_index_indensity, 2))) +
  scale_fill_continuous(low = "white", high = "black")
# Same heat map for severity_index5
df %>%
  group_by(Type.of.organization, Type.of.breach) %>%
  summarise(severity_index = sum(severity_index5)) %>%
  # normalise within each organization type
  group_by(Type.of.organization) %>%
  mutate(severity_index_indensity = severity_index / sum(severity_index)) %>%
  ggplot(aes(x = Type.of.organization, y = Type.of.breach)) +
  geom_tile(aes(fill = severity_index_indensity)) +
  geom_text(aes(label = round(severity_index_indensity, 2))) +
  scale_fill_continuous(low = "white", high = "black")
# Mean severity_index1 per state
state_index <- summarise(group_by(df, State), mean = mean(severity_index1))
# Print the states from highest to lowest mean
state_index[order(-state_index[["mean"]]), ]
## # A tibble: 50 x 2
## State mean
## <fct> <dbl>
## 1 North Dakota 18
## 2 Delaware 17.9
## 3 South Dakota 17.5
## 4 Nebraska 17.2
## 5 Nevada 17.1
## 6 Vermont 17.0
## 7 Idaho 17.0
## 8 New Hampshire 17.0
## 9 Arizona 16.8
## 10 Oklahoma 16.8
## # … with 40 more rows
# Box plots of severity_index1 against each categorical variable
# by breach type
ggplot(df, aes(Type.of.breach, severity_index1)) + geom_boxplot()
# by breach_type_gB
ggplot(df, aes(breach_type_gB, severity_index1)) + geom_boxplot()
# by breach_type_gC
ggplot(df, aes(breach_type_gC, severity_index1)) + geom_boxplot()
# by organization type
ggplot(df, aes(Type.of.organization, severity_index1)) + geom_boxplot()
# by org_group
ggplot(df, aes(org_group, severity_index1)) + geom_boxplot()
# by region_B
ggplot(df, aes(region_B, severity_index1)) + geom_boxplot()
# age_of_Law vs. severity_index1: scatter plot with semi-transparent points so
# overlapping observations remain visible
ggplot(aes(x = age_of_Law, y = severity_index1), data = df) +
  geom_point(alpha = 0.5)
# Pearson correlation test between age_of_Law and severity_index1.
# This call previously tested severity_index2 (copy-paste slip from the next
# section), so any saved output for this section reports index 2 and must be
# regenerated.
cor.test(df$age_of_Law, df$severity_index1)
##
## Pearson's product-moment correlation
##
## data: df$age_of_Law and df$severity_index2
## t = -2.8481, df = 5353, p-value = 0.004414
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.06561495 -0.01212663
## sample estimates:
## cor
## -0.03889865
# Mean severity_index2 per state
state_index <- summarise(group_by(df, State), mean = mean(severity_index2))
# Print the states from highest to lowest mean
state_index[order(-state_index[["mean"]]), ]
## # A tibble: 50 x 2
## State mean
## <fct> <dbl>
## 1 North Dakota 19.6
## 2 Kentucky 19.1
## 3 South Dakota 18.9
## 4 Tennessee 18.8
## 5 Missouri 18.8
## 6 Montana 18.7
## 7 Maryland 18.7
## 8 Alabama 18.7
## 9 Florida 18.7
## 10 Indiana 18.6
## # … with 40 more rows
# Box plots of severity_index2 against each categorical variable
# by breach type
ggplot(df, aes(Type.of.breach, severity_index2)) + geom_boxplot()
# by breach_type_gB
ggplot(df, aes(breach_type_gB, severity_index2)) + geom_boxplot()
# by breach_type_gC
ggplot(df, aes(breach_type_gC, severity_index2)) + geom_boxplot()
# by organization type
ggplot(df, aes(Type.of.organization, severity_index2)) + geom_boxplot()
# by org_group
ggplot(df, aes(org_group, severity_index2)) + geom_boxplot()
# by region_B
ggplot(df, aes(region_B, severity_index2)) + geom_boxplot()
# Scatter plot of severity_index2 against age_of_Law
ggplot(df, aes(age_of_Law, severity_index2)) + geom_point(alpha = 0.5)
# Pearson correlation test between age_of_Law and severity_index2
cor.test(df$age_of_Law, df$severity_index2)
##
## Pearson's product-moment correlation
##
## data: df$age_of_Law and df$severity_index2
## t = -2.8481, df = 5353, p-value = 0.004414
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.06561495 -0.01212663
## sample estimates:
## cor
## -0.03889865
# Mean severity_index3 per state
state_index <- summarise(group_by(df, State), mean = mean(severity_index3))
# Print the states from highest to lowest mean
state_index[order(-state_index[["mean"]]), ]
## # A tibble: 50 x 2
## State mean
## <fct> <dbl>
## 1 North Dakota 27.6
## 2 South Dakota 26.5
## 3 Nebraska 25.8
## 4 Delaware 25.5
## 5 Nevada 25.5
## 6 Maryland 25.3
## 7 Arizona 25.2
## 8 Kentucky 25.0
## 9 Oklahoma 25.0
## 10 Texas 25.0
## # … with 40 more rows
# Box plots of severity_index3 against each categorical variable
# by breach type
ggplot(df, aes(Type.of.breach, severity_index3)) + geom_boxplot()
# by breach_type_gB
ggplot(df, aes(breach_type_gB, severity_index3)) + geom_boxplot()
# by breach_type_gC
ggplot(df, aes(breach_type_gC, severity_index3)) + geom_boxplot()
# by organization type
ggplot(df, aes(Type.of.organization, severity_index3)) + geom_boxplot()
# by org_group
ggplot(df, aes(org_group, severity_index3)) + geom_boxplot()
# by region_B
ggplot(df, aes(region_B, severity_index3)) + geom_boxplot()
# Scatter plot of severity_index3 against age_of_Law
ggplot(df, aes(age_of_Law, severity_index3)) + geom_point(alpha = 0.5)
# Pearson correlation test between age_of_Law and severity_index3
cor.test(df$age_of_Law, df$severity_index3)
##
## Pearson's product-moment correlation
##
## data: df$age_of_Law and df$severity_index3
## t = -0.78307, df = 5353, p-value = 0.4336
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.03747623 0.01608696
## sample estimates:
## cor
## -0.01070231
# Mean severity_index4 per state
state_index <- summarise(group_by(df, State), mean = mean(severity_index4))
# Print the states from highest to lowest mean
state_index[order(-state_index[["mean"]]), ]
## # A tibble: 50 x 2
## State mean
## <fct> <dbl>
## 1 North Dakota 9.19
## 2 South Dakota 8.82
## 3 Nebraska 8.59
## 4 Delaware 8.51
## 5 Nevada 8.51
## 6 Maryland 8.43
## 7 Arizona 8.40
## 8 Kentucky 8.32
## 9 Oklahoma 8.32
## 10 Texas 8.32
## # … with 40 more rows
# Box plots of severity_index4 against each categorical variable
# by breach type
ggplot(df, aes(Type.of.breach, severity_index4)) + geom_boxplot()
# by breach_type_gB
ggplot(df, aes(breach_type_gB, severity_index4)) + geom_boxplot()
# by breach_type_gC
ggplot(df, aes(breach_type_gC, severity_index4)) + geom_boxplot()
# by organization type
ggplot(df, aes(Type.of.organization, severity_index4)) + geom_boxplot()
# by org_group
ggplot(df, aes(org_group, severity_index4)) + geom_boxplot()
# by region_B
ggplot(df, aes(region_B, severity_index4)) + geom_boxplot()
# Scatter plot of severity_index4 against age_of_Law
ggplot(df, aes(age_of_Law, severity_index4)) + geom_point(alpha = 0.5)
# Pearson correlation test between age_of_Law and severity_index4
cor.test(df$age_of_Law, df$severity_index4)
##
## Pearson's product-moment correlation
##
## data: df$age_of_Law and df$severity_index4
## t = -0.78307, df = 5353, p-value = 0.4336
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.03747623 0.01608696
## sample estimates:
## cor
## -0.01070231
# Mean severity_index5 per state
state_index <- summarise(group_by(df, State), mean = mean(severity_index5))
# Print the states from highest to lowest mean
state_index[order(-state_index[["mean"]]), ]
## # A tibble: 50 x 2
## State mean
## <fct> <dbl>
## 1 West Virginia 15.3
## 2 Maine 13.7
## 3 Utah 13.4
## 4 Arkansas 13.4
## 5 Hawaii 13.0
## 6 Alaska 12.6
## 7 Wyoming 12.4
## 8 Vermont 12.2
## 9 North Carolina 12.2
## 10 Iowa 12.0
## # … with 40 more rows
# Box plots of severity_index5 against each categorical variable
# by breach type
ggplot(df, aes(Type.of.breach, severity_index5)) + geom_boxplot()
# by breach_type_gB
ggplot(df, aes(breach_type_gB, severity_index5)) + geom_boxplot()
# by breach_type_gC
ggplot(df, aes(breach_type_gC, severity_index5)) + geom_boxplot()
# by organization type
ggplot(df, aes(Type.of.organization, severity_index5)) + geom_boxplot()
# by org_group
ggplot(df, aes(org_group, severity_index5)) + geom_boxplot()
# by region_B
ggplot(df, aes(region_B, severity_index5)) + geom_boxplot()
# Scatter plot of severity_index5 against age_of_Law
ggplot(df, aes(age_of_Law, severity_index5)) + geom_point(alpha = 0.5)
# Pearson correlation test between age_of_Law and severity_index5
cor.test(df$age_of_Law, df$severity_index5)
##
## Pearson's product-moment correlation
##
## data: df$age_of_Law and df$severity_index5
## t = -0.29505, df = 5353, p-value = 0.768
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.03081402 0.02275443
## sample estimates:
## cor
## -0.004032687